In [45]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from dynaconf import Dynaconf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score, accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.inspection import PartialDependenceDisplay
from sklearn.feature_selection import SelectKBest, f_regression, RFE
import matplotlib.pyplot as plt
import seaborn as sns
import math
import shap
import joblib
In [2]:
#Load matchup_feature data
settings = Dynaconf(envvar_prefix='MYAPP', load_dotenv=True)
DB_URL = settings.DB_ENGINE_URL
engine = create_engine(DB_URL)
with engine.begin() as conn:
matchup_df = pd.read_sql('SELECT * FROM matchup_features', conn)
In [4]:
#Clean/Explore
matchup_df.head()
Out[4]:
| game_id | season | week | elo_diff | points_per_game_diff | points_allowed_per_game_diff | recent_points_per_game_diff | recent_points_allowed_per_game_diff | margin_of_victory_diff | win_rate_diff | ... | yards_allowed_per_play_diff | explosiveness_diff | success_rate_diff | travel_distance | rest_days_diff | recent_form_diff | neutral_site | vegas_spread_close | vegas_over_under_close | home_win | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 400603827 | 2015 | 1 | 28.86 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | 687.645 | -9.223370e+18 | NaN | 1 | -12.0 | 44.0 | 1 |
| 1 | 400603828 | 2015 | 1 | 29.71 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | 767.009 | -9.223370e+18 | NaN | 0 | -32.5 | 49.5 | 1 |
| 2 | 400603829 | 2015 | 1 | 19.42 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | 386.583 | -9.223370e+18 | NaN | 1 | -10.5 | 54.0 | 1 |
| 3 | 400603830 | 2015 | 1 | 20.00 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | 1456.180 | -9.223370e+18 | NaN | 0 | -34.0 | 54.0 | 1 |
| 4 | 400603831 | 2015 | 1 | 29.71 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | 512.644 | -9.223370e+18 | NaN | 0 | -35.0 | 57.0 | 1 |
5 rows × 21 columns
In [94]:
matchup_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7723 entries, 0 to 7722 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 game_id 7723 non-null int64 1 season 7723 non-null int64 2 week 7723 non-null int64 3 elo_diff 7723 non-null float64 4 points_per_game_diff 6598 non-null float64 5 points_allowed_per_game_diff 6598 non-null float64 6 recent_points_per_game_diff 6598 non-null float64 7 recent_points_allowed_per_game_diff 6598 non-null float64 8 margin_of_victory_diff 6598 non-null float64 9 win_rate_diff 6598 non-null float64 10 yards_per_play_diff 6497 non-null float64 11 yards_allowed_per_play_diff 6497 non-null float64 12 explosiveness_diff 6497 non-null float64 13 success_rate_diff 6497 non-null float64 14 travel_distance 7723 non-null float64 15 rest_days_diff 7723 non-null float64 16 recent_form_diff 6598 non-null float64 17 neutral_site 7723 non-null int64 18 vegas_spread_close 7692 non-null float64 19 vegas_over_under_close 7648 non-null float64 20 home_win 7723 non-null int64 dtypes: float64(16), int64(5) memory usage: 1.2 MB
In [5]:
matchup_df.describe()
Out[5]:
| game_id | season | week | elo_diff | points_per_game_diff | points_allowed_per_game_diff | recent_points_per_game_diff | recent_points_allowed_per_game_diff | margin_of_victory_diff | win_rate_diff | ... | yards_allowed_per_play_diff | explosiveness_diff | success_rate_diff | travel_distance | rest_days_diff | recent_form_diff | neutral_site | vegas_spread_close | vegas_over_under_close | home_win | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7.723000e+03 | 7723.000000 | 7723.000000 | 7723.000000 | 6598.000000 | 6598.000000 | 6598.000000 | 6598.000000 | 6598.000000 | 6598.000000 | ... | 6497.000000 | 6497.000000 | 6497.000000 | 7723.000000 | 7.723000e+03 | 6598.000000 | 7723.000000 | 7692.000000 | 7648.000000 | 7723.000000 |
| mean | 4.011980e+08 | 2019.667487 | 7.181536 | 9.373491 | 0.080466 | -0.356659 | -0.109427 | -0.233859 | 0.437125 | 0.006678 | ... | -0.024634 | -0.001210 | 0.001587 | 594.044639 | -1.343514e+18 | 0.000682 | 0.078208 | -4.275936 | 55.140167 | 0.578273 |
| std | 2.933903e+05 | 3.006470 | 4.122218 | 126.253496 | 11.720626 | 11.673811 | 13.694397 | 13.583830 | 18.221066 | 0.395722 | ... | 0.997241 | 0.191148 | 0.076994 | 538.927423 | 3.253993e+18 | 0.473894 | 0.268516 | 13.981723 | 8.214623 | 0.493867 |
| min | 4.006038e+08 | 2015.000000 | 1.000000 | -519.580000 | -66.000000 | -63.000000 | -66.000000 | -63.000000 | -86.000000 | -1.000000 | ... | -4.541810 | -1.311670 | -0.331206 | 2.562420 | -9.223370e+18 | -1.000000 | 0.000000 | -54.500000 | 27.500000 | 0.000000 |
| 25% | 4.009375e+08 | 2017.000000 | 4.000000 | -67.995000 | -7.500000 | -7.666670 | -9.333330 | -9.000000 | -11.333300 | -0.250000 | ... | -0.630453 | -0.114764 | -0.046310 | 244.819500 | -5.184000e+14 | -0.333333 | 0.000000 | -13.500000 | 49.000000 | 0.000000 |
| 50% | 4.012072e+08 | 2020.000000 | 7.000000 | 9.430000 | 0.000000 | -0.250000 | 0.000000 | -0.333333 | 0.166667 | 0.000000 | ... | -0.019434 | 0.000949 | 0.001684 | 448.965000 | 0.000000e+00 | 0.000000 | 0.000000 | -3.500000 | 54.500000 | 1.000000 |
| 75% | 4.014264e+08 | 2022.000000 | 11.000000 | 86.955000 | 7.714290 | 7.285710 | 9.000000 | 9.000000 | 11.833300 | 0.266667 | ... | 0.601632 | 0.113740 | 0.049644 | 770.691000 | 0.000000e+00 | 0.333333 | 0.000000 | 5.000000 | 60.500000 | 1.000000 |
| max | 4.017628e+08 | 2025.000000 | 16.000000 | 567.320000 | 59.000000 | 53.333300 | 59.000000 | 53.333300 | 90.000000 | 1.000000 | ... | 6.387010 | 1.099930 | 0.320819 | 5014.590000 | 1.088640e+16 | 1.000000 | 1.000000 | 54.000000 | 90.000000 | 1.000000 |
8 rows × 21 columns
In [6]:
#Missing Values
matchup_df.isnull().sum()
Out[6]:
game_id 0 season 0 week 0 elo_diff 0 points_per_game_diff 1125 points_allowed_per_game_diff 1125 recent_points_per_game_diff 1125 recent_points_allowed_per_game_diff 1125 margin_of_victory_diff 1125 win_rate_diff 1125 yards_per_play_diff 1226 yards_allowed_per_play_diff 1226 explosiveness_diff 1226 success_rate_diff 1226 travel_distance 0 rest_days_diff 0 recent_form_diff 1125 neutral_site 0 vegas_spread_close 31 vegas_over_under_close 75 home_win 0 dtype: int64
In [7]:
sns.heatmap(matchup_df.isnull(), cbar=False)
plt.title("Missing Values Heatmap")
plt.show()
In [8]:
#Outlier detection
numeric_cols = matchup_df.select_dtypes(include=[np.number]).columns
scaled = pd.DataFrame(StandardScaler().fit_transform(matchup_df[numeric_cols]), columns=numeric_cols)
scaled.boxplot(figsize=(15,6))
plt.xticks(rotation=90)
plt.title("Boxplot of Standardized Features")
plt.show()
In [99]:
numeric_cols = matchup_df.select_dtypes(include=[np.number]).columns
cols = 3 # Number of columns in the grid
rows = int(np.ceil(len(numeric_cols) / cols))
fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 4 * rows))
axes = axes.flatten()
for idx, col in enumerate(numeric_cols):
# Highlight outliers for this column
outlier_mask = np.abs(matchup_df[col]) > matchup_df[col].quantile(0.99)
sns.scatterplot(
x=matchup_df[col],
y=matchup_df['elo_diff'],
hue=outlier_mask,
ax=axes[idx],
palette={True: "red", False: "blue"},
legend=False
)
axes[idx].set_title(f"{col} vs elo_diff")
axes[idx].set_xlabel(col)
axes[idx].set_ylabel("elo_diff")
# Hide any unused subplots
for idx in range(len(numeric_cols), len(axes)):
axes[idx].set_visible(False)
fig.suptitle("Scatterplots of Numeric Features vs. Elo Diff (Outliers Highlighted)", fontsize=18, y=1.02)
plt.tight_layout()
plt.show()
In [9]:
#Correlation analysis
corr = matchup_df.corr(numeric_only=True)
plt.figure(figsize=(16,12)) # Larger figure for clarity
sns.heatmap(
corr,
annot=True,
cmap='coolwarm',
annot_kws={"size": 10} # Adjust annotation font size
)
plt.title("Correlation Heatmap of Numeric Features", fontsize=18)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.show()
In [11]:
#Target Variable Distribution
sns.histplot(matchup_df['home_win'].astype(float), bins=2)
plt.title('Target Distribution')
plt.xlabel('Home Loss (0) Home Win (1)')
plt.show()
In [12]:
#Check how many games were played at neutral sites
print(matchup_df[matchup_df['neutral_site'] == 1].shape[0])
604
In [20]:
#Since those games will have erroneous travel distances, drop those games
matchup_df = matchup_df[matchup_df['neutral_site'] == 0]
matchup_df = matchup_df.drop(columns=['neutral_site'])
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) File /opt/anaconda3/envs/collegeFootball/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key) 3811 try: -> 3812 return self._engine.get_loc(casted_key) 3813 except KeyError as err: File pandas/_libs/index.pyx:167, in pandas._libs.index.IndexEngine.get_loc() File pandas/_libs/index.pyx:196, in pandas._libs.index.IndexEngine.get_loc() File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item() File pandas/_libs/hashtable_class_helper.pxi:7096, in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'neutral_site' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) Cell In[20], line 2 1 #Since those games will have erroneous travel distances, drop those games ----> 2 matchup_df = matchup_df[matchup_df['neutral_site'] == 0] 3 matchup_df = matchup_df.drop(columns=['neutral_site']) File /opt/anaconda3/envs/collegeFootball/lib/python3.12/site-packages/pandas/core/frame.py:4107, in DataFrame.__getitem__(self, key) 4105 if self.columns.nlevels > 1: 4106 return self._getitem_multilevel(key) -> 4107 indexer = self.columns.get_loc(key) 4108 if is_integer(indexer): 4109 indexer = [indexer] File /opt/anaconda3/envs/collegeFootball/lib/python3.12/site-packages/pandas/core/indexes/base.py:3819, in Index.get_loc(self, key) 3814 if isinstance(casted_key, slice) or ( 3815 isinstance(casted_key, abc.Iterable) 3816 and any(isinstance(x, slice) for x in casted_key) 3817 ): 3818 raise InvalidIndexError(key) -> 3819 raise KeyError(key) from err 3820 except TypeError: 3821 # If we have a listlike key, _check_indexing_error will raise 3822 # InvalidIndexError. Otherwise we fall through and re-raise 3823 # the TypeError. 3824 self._check_indexing_error(key) KeyError: 'neutral_site'
In [21]:
#Drop rows with missing values
matchup_df = matchup_df.dropna()
In [22]:
#Define features and target
X = matchup_df.drop(columns=['home_win', 'game_id', 'season', 'week']).copy()
y = matchup_df['home_win'].astype(float).copy()
In [44]:
# Pairplot for selected features and target
pairplot_df = matchup_df.drop(columns=['game_id', 'season', 'week']).copy()
sns.pairplot(pairplot_df, hue='home_win', diag_kind='kde')
plt.suptitle('Pairplot of Key Features', y=1.02)
plt.show()